From e571717e06667228ec8d689be067e00bdd06d34d Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Thu, 12 Dec 2013 10:45:07 +1100 Subject: [PATCH] Plural rules: updates for UTS #35 Rev 33 * New operands i, v, w, f, t * New operators =, !=, % * Ignore "samples", which are basically unit tests embedded in rule specifications * Ignore the new "other" rules, which have an empty condition. It doesn't really make sense to parse them, since the empty condition means special handling should be done in the caller; it is not equivalent to an unconditional true or false. * Trailing zero support requires that the input number be a string. Documented this. * Fixed some comments * Added test cases for new features Bug: 56931 Change-Id: I96986c0c664f785e75b0a4ced2ec9e37b72681c1 --- includes/cache/LocalisationCache.php | 4 + languages/utils/CLDRPluralRuleEvaluator.php | 106 ++++++++++++++---- .../utils/CLDRPluralRuleEvaluatorTest.php | 50 +++++++++ 3 files changed, 136 insertions(+), 24 deletions(-) diff --git a/includes/cache/LocalisationCache.php b/includes/cache/LocalisationCache.php index ccb94a2c5b..e18c64b0e5 100644 --- a/includes/cache/LocalisationCache.php +++ b/includes/cache/LocalisationCache.php @@ -609,6 +609,10 @@ class LocalisationCache { $ruleElements = $ruleset->getElementsByTagName( "pluralRule" ); foreach ( $ruleElements as $elt ) { $ruleType = $elt->getAttribute( 'count' ); + if ( $ruleType === 'other' ) { + // Don't record "other" rules, which have an empty condition + continue; + } $rules[] = $elt->nodeValue; $ruleTypes[] = $ruleType; } diff --git a/languages/utils/CLDRPluralRuleEvaluator.php b/languages/utils/CLDRPluralRuleEvaluator.php index c2aede0249..1f4d0cde4a 100644 --- a/languages/utils/CLDRPluralRuleEvaluator.php +++ b/languages/utils/CLDRPluralRuleEvaluator.php @@ -2,7 +2,8 @@ /** * Parse and evaluate a plural rule.
* - * http://unicode.org/reports/tr35/#Language_Plural_Rules + * UTS #35 Revision 33 + * http://www.unicode.org/reports/tr35/tr35-33/tr35-numbers.html#Language_Plural_Rules * * @author Niklas Laxstrom, Tim Starling * @@ -63,11 +64,41 @@ class CLDRPluralRuleEvaluator { * Evaluate a compiled set of rules returned by compile(). Do not allow * the user to edit the compiled form, or else PHP errors may result. * - * @param int The number to be evaluated against the rules + * @param string The number to be evaluated against the rules, in English, or it + * may be a type convertible to string. * @param array The associative array of plural rules in pluralform => rule format. * @return int The index of the plural form which passed the evaluation */ public static function evaluateCompiled( $number, array $rules ) { + // Calculate the values of the operand symbols + $number = strval( $number ); + if ( !preg_match( '/^ -? ( ([0-9]+) (?: \. ([0-9]+) )? )$/x', $number, $m ) ) { + wfDebug( __METHOD__.': invalid number input, returning "other"' ); + return count( $rules ); + } + if ( !isset( $m[3] ) ) { + $operandSymbols = array( + 'n' => intval( $m[1] ), + 'i' => intval( $m[1] ), + 'v' => 0, + 'w' => 0, + 'f' => 0, + 't' => 0 + ); + } else { + $absValStr = $m[1]; + $intStr = $m[2]; + $fracStr = $m[3]; + $operandSymbols = array( + 'n' => floatval( $absValStr ), + 'i' => intval( $intStr ), + 'v' => strlen( $fracStr ), + 'w' => strlen( rtrim( $fracStr, '0' ) ), + 'f' => intval( $fracStr ), + 't' => intval( rtrim( $fracStr, '0' ) ), + ); + } + // The compiled form is RPN, with tokens strictly delimited by // spaces, so this is a simple RPN evaluator. 
foreach ( $rules as $i => $rule ) { @@ -76,8 +107,8 @@ class CLDRPluralRuleEvaluator { $nine = ord( '9' ); foreach ( StringUtils::explode( ' ', $rule ) as $token ) { $ord = ord( $token ); - if ( $token === 'n' ) { - $stack[] = $number; + if ( isset( $operandSymbols[$token] ) ) { + $stack[] = $operandSymbols[$token]; } elseif ( $ord >= $zero && $ord <= $nine ) { $stack[] = intval( $token ); } else { @@ -91,8 +122,8 @@ class CLDRPluralRuleEvaluator { return $i; } } - // None of the provided rules match. The number belongs to caregory - // 'other' which comes last. + // None of the provided rules match. The number belongs to category + // 'other', which comes last. return count( $rules ); } @@ -251,35 +282,35 @@ class CLDRPluralRuleEvaluator_Range { */ class CLDRPluralRuleConverter { /** - * The rule + * The input string * * @var string */ public $rule; /** - * The position + * The current position * * @var int */ public $pos; /** - * The last position possible + * The past-the-end position * * @var int */ public $end; /** - * The operators + * The operator stack * * @var array */ public $operators = array(); /** - * The operands + * The operand stack * * @var array */ @@ -311,14 +342,19 @@ class CLDRPluralRuleConverter { /** * Same for digits. Note that the grammar given in UTS #35 doesn't allow - * negative numbers or decimals. + * negative numbers or decimal separators. */ const NUMBER_CLASS = '0123456789'; + /** + * A character list of symbolic operands. + */ + const OPERAND_SYMBOLS = 'nivwft'; + /** * An anchored regular expression which matches a word at the current offset. */ - const WORD_REGEX = '/[a-zA-Z]+/A'; + const WORD_REGEX = '/[a-zA-Z@]+/A'; /** * Convert a rule to RPN. This is the only public entry point. 
@@ -425,17 +461,19 @@ class CLDRPluralRuleConverter { return $token; } - // Comma - if ( $this->rule[$this->pos] === ',' ) { - $token = $this->newOperator( ',', $this->pos, 1 ); - $this->pos ++; + // Two-character operators + $op2 = substr( $this->rule, $this->pos, 2 ); + if ( $op2 === '..' || $op2 === '!=' ) { + $token = $this->newOperator( $op2, $this->pos, 2 ); + $this->pos += 2; return $token; } - // Dot dot - if ( substr( $this->rule, $this->pos, 2 ) === '..' ) { - $token = $this->newOperator( '..', $this->pos, 2 ); - $this->pos += 2; + // Single-character operators + $op1 = $this->rule[$this->pos]; + if ( $op1 === ',' || $op1 === '=' || $op1 === '%' ) { + $token = $this->newOperator( $op1, $this->pos, 1 ); + $this->pos ++; return $token; } @@ -474,13 +512,21 @@ class CLDRPluralRuleConverter { return $token; } - // The special numerical keyword "n" - if ( $word1 === 'n' ) { - $token = $this->newNumber( 'n', $this->pos ); + // The single-character operand symbols + if ( strpos( self::OPERAND_SYMBOLS, $word1 ) !== false ) { + $token = $this->newNumber( $word1, $this->pos ); $this->pos ++; return $token; } + // Samples + if ( $word1 === '@integer' || $word1 === '@decimal' ) { + // Samples are like comments, they have no effect on rule evaluation. + // They run from the first sample indicator to the end of the string. 
+ $this->pos = $this->end; + return false; + } + $this->error( 'unrecognised word' ); } @@ -624,6 +670,15 @@ class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment 'r' => 'range', ); + /** + * Map for converting the new operators introduced in Rev 33 to the old forms + */ + static $aliasMap = array( + '%' => 'mod', + '!=' => 'not-in', + '=' => 'in' + ); + /** * Initialize a new instance of a CLDRPluralRuleConverter_Operator object * @@ -634,6 +689,9 @@ class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment */ function __construct( $parser, $name, $pos, $length ) { parent::__construct( $parser, $pos, $length ); + if ( isset( self::$aliasMap[$name] ) ) { + $name = self::$aliasMap[$name]; + } $this->name = $name; } diff --git a/tests/phpunit/languages/utils/CLDRPluralRuleEvaluatorTest.php b/tests/phpunit/languages/utils/CLDRPluralRuleEvaluatorTest.php index 6abd09f284..71d32c3f87 100644 --- a/tests/phpunit/languages/utils/CLDRPluralRuleEvaluatorTest.php +++ b/tests/phpunit/languages/utils/CLDRPluralRuleEvaluatorTest.php @@ -81,6 +81,56 @@ class CLDRPluralRuleEvaluatorTest extends MediaWikiTestCase { array( 0, 'n in 3..10,13..19', 13, 'scottish rule - ranges with comma' ), array( 0, '5 mod 3 is n', 2, 'n as result of mod - no need to pass' ), + + # Revision 33 new operand examples + # expected, rule, number, comment + array( 0, 'i is 1', '1.00', 'new operand i' ), + array( 0, 'v is 2', '1.00', 'new operand v' ), + array( 0, 'w is 0', '1.00', 'new operand w' ), + array( 0, 'f is 0', '1.00', 'new operand f' ), + array( 0, 't is 0', '1.00', 'new operand t' ), + + array( 0, 'i is 1', '1.30', 'new operand i' ), + array( 0, 'v is 2', '1.30', 'new operand v' ), + array( 0, 'w is 1', '1.30', 'new operand w' ), + array( 0, 'f is 30', '1.30', 'new operand f' ), + array( 0, 't is 3', '1.30', 'new operand t' ), + + array( 0, 'i is 1', '1.03', 'new operand i' ), + array( 0, 'v is 2', '1.03', 'new operand v' ), + array( 0, 'w is 2', 
'1.03', 'new operand w' ), + array( 0, 'f is 3', '1.03', 'new operand f' ), + array( 0, 't is 3', '1.03', 'new operand t' ), + + # Revision 33 new operator aliases + # expected, rule, number, comment + array( 0, 'n % 3 is 1', 7, 'new % operator' ), + array( 0, 'n = 1,3,5', 3, 'new = operator' ), + array( 1, 'n != 1,3,5', 5, 'new != operator' ), + + # Revision 33 samples + # expected, rule, number, comment + array( 0, 'n in 1,3,5@integer 3~10, 103~110, 1003, … @decimal 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 103.0, 1003.0, …', 3, 'samples' ), + + # Revision 33 some test cases from CLDR + array( 0, 'i = 1 and v = 0 or i = 0 and t = 1', '0.1', 'pt one' ), + array( 0, 'i = 1 and v = 0 or i = 0 and t = 1', '0.01', 'pt one' ), + array( 0, 'i = 1 and v = 0 or i = 0 and t = 1', '0.10', 'pt one' ), + array( 0, 'i = 1 and v = 0 or i = 0 and t = 1', '0.010', 'pt one' ), + array( 0, 'i = 1 and v = 0 or i = 0 and t = 1', '0.100', 'pt one' ), + array( 1, 'i = 1 and v = 0 or i = 0 and t = 1', '0.0', 'pt other' ), + array( 1, 'i = 1 and v = 0 or i = 0 and t = 1', '0.2', 'pt other' ), + array( 1, 'i = 1 and v = 0 or i = 0 and t = 1', '10.0', 'pt other' ), + array( 1, 'i = 1 and v = 0 or i = 0 and t = 1', '100.0', 'pt other' ), + array( 0, 'v = 0 and i % 10 = 2..4 and i % 100 != 12..14 or f % 10 = 2..4 and f % 100 != 12..14', '2', 'bs few' ), + array( 0, 'v = 0 and i % 10 = 2..4 and i % 100 != 12..14 or f % 10 = 2..4 and f % 100 != 12..14', '4', 'bs few' ), + array( 0, 'v = 0 and i % 10 = 2..4 and i % 100 != 12..14 or f % 10 = 2..4 and f % 100 != 12..14', '22', 'bs few' ), + array( 0, 'v = 0 and i % 10 = 2..4 and i % 100 != 12..14 or f % 10 = 2..4 and f % 100 != 12..14', '102', 'bs few' ), + array( 0, 'v = 0 and i % 10 = 2..4 and i % 100 != 12..14 or f % 10 = 2..4 and f % 100 != 12..14', '0.2', 'bs few' ), + array( 0, 'v = 0 and i % 10 = 2..4 and i % 100 != 12..14 or f % 10 = 2..4 and f % 100 != 12..14', '0.4', 'bs few' ), + array( 0, 'v = 0 and i % 10 = 2..4 and i % 100 != 12..14 
or f % 10 = 2..4 and f % 100 != 12..14', '10.2', 'bs few' ), + array( 1, 'v = 0 and i % 10 = 2..4 and i % 100 != 12..14 or f % 10 = 2..4 and f % 100 != 12..14', '10.0', 'bs other' ), + ); return $tests; -- 2.20.1